### SURVIVAL ANALYSIS WITH HHI AND POPULATION VARIABLES

library(dplyr)
library(survival)
library(moments)
library(modelsummary)
library(flextable)
library(car)  # For VIF calculation

# Define services in the desired order
services <- c("Waste_treatment", "Waste_collection", "Transport", "Fire", "Libraries",
              "Civil_protection", "Drinking_water", "Sewer")

# Load data with HHI
data <- readRDS("table6_data.rds")

transform_skewed_data_cox <- function(data) {
  # Function to handle skewness transformation
  transform_variable <- function(x) {
    skew <- skewness(x, na.rm = TRUE)
    
    if (abs(skew) > 1) {
      if (skew > 0) {
        # For positive skew, try log transformation if all values are positive
        if (min(x, na.rm = TRUE) > 0) {
          x_transformed <- log(x)
        } else {
          # If there are zero or negative values, shift the data
          x_shifted <- x - min(x, na.rm = TRUE) + 1
          x_transformed <- log(x_shifted)
        }
      } else {
        # For negative skew, try exponential transformation
        x_transformed <- exp(scale(x))
      }
      return(as.vector(scale(x_transformed)))
    } else {
      return(as.vector(scale(x)))
    }
  }
  
  # Apply transformations and scaling - INCLUDING HHI AND POPULATION
  data %>%
    mutate(
      population = transform_variable(population),
      population_squared = transform_variable(I(population^2)),
      VOTANTS_PERCENT = transform_variable(VOTANTS_PERCENT),
      EUROS_HABITANT = transform_variable(EUROS_HABITANT),
      hhi = transform_variable(-1 * hhi),  # MULTIPLY HHI BY -1 FOR INVERTED INTERPRETATION
      year = factor(year)
    )
}

# Apply transformation to your data
data_rescaled <- transform_skewed_data_cox(data)

# Create survival data with transformed variables
survival_data <- data_rescaled %>%
  group_by(muni, id, service) %>%
  mutate(
    time = as.numeric(as.character(year)) - 2009,
    event = imc_dummy
  ) %>%
  slice(if(any(event == 1)) {
    which.max(event)
  } else {
    n()
  }) %>%
  ungroup()

# Function to calculate VIF for Cox models
calculate_vif_cox <- function(data, formula_vars, service_name) {
  tryCatch({
    service_data <- data %>% filter(service == service_name)
    
    if(nrow(service_data) < 2) return(NULL)
    
    # Create design matrix for the predictors
    predictor_data <- service_data[, formula_vars, drop = FALSE]
    
    # Remove any rows with missing values
    predictor_data <- predictor_data[complete.cases(predictor_data), , drop = FALSE]
    
    if(nrow(predictor_data) < 2 || ncol(predictor_data) < 2) return(NULL)
    
    # Calculate correlation matrix and VIF
    cor_matrix <- cor(predictor_data)
    if(any(is.na(cor_matrix)) || det(cor_matrix) == 0) return(NULL)
    
    vif_values <- diag(solve(cor_matrix))
    names(vif_values) <- formula_vars
    
    return(vif_values)
  }, error = function(e) {
    message("Error calculating VIF for ", service_name, ": ", e$message)
    return(NULL)
  })
}

# VIF Analysis
cat("=== VIF ANALYSIS FOR POPULATION + HHI MODEL (HHI INVERTED) ===\n")

# Model specification: Population + Population² + HHI + controls
model_vars <- c("population", "population_squared", "hhi", "EUROS_HABITANT", "VOTANTS_PERCENT")

# Store VIF results
vif_results <- list()

# Calculate VIF for each service
for(service in services) {
  cat("\n=== VIF ANALYSIS FOR", service, "===\n")
  
  vif_vals <- calculate_vif_cox(survival_data, model_vars, service)
  if(!is.null(vif_vals)) {
    vif_results[[service]] <- vif_vals
    print(round(vif_vals, 3))
    
    # Flag high VIF values
    high_vif <- vif_vals[vif_vals > 5 & !is.na(vif_vals)]
    if(length(high_vif) > 0) {
      cat("*** WARNING: High VIF detected (>5):\n")
      print(round(high_vif, 3))
    }
    very_high_vif <- vif_vals[vif_vals > 10 & !is.na(vif_vals)]
    if(length(very_high_vif) > 0) {
      cat("*** SEVERE WARNING: Very high VIF detected (>10):\n")
      print(round(very_high_vif, 3))
    }
  } else {
    cat("VIF calculation failed\n")
  }
}

# VIF Summary Table
if(length(vif_results) > 0) {
  cat("\n=== VIF SUMMARY TABLE ===\n")
  vif_df <- do.call(rbind, lapply(names(vif_results), function(service) {
    vif_vals <- vif_results[[service]]
    data.frame(
      Service = service,
      Variable = names(vif_vals),
      VIF = round(vif_vals, 3),
      stringsAsFactors = FALSE
    )
  }))
  print(vif_df)
  cat("Mean VIF:", round(mean(vif_df$VIF, na.rm = TRUE), 3), "\n")
  cat("Max VIF:", round(max(vif_df$VIF, na.rm = TRUE), 3), "\n")
  cat("Variables with VIF > 5:", sum(vif_df$VIF > 5, na.rm = TRUE), "\n")
}

# MODEL FUNCTION: Population + Population² + HHI
run_cox_model_hhi <- function(data, service_name) {
  tryCatch({
    service_data <- data %>%
      filter(service == service_name)
    
    if(nrow(service_data) < 2) return(NULL)
    if(sum(service_data$event) == 0) return(NULL)
    
    surv_obj <- Surv(time = service_data$time, 
                     event = service_data$event)
    
    cox_model <- coxph(surv_obj ~ population +
                       population_squared + 
                       hhi +
                       EUROS_HABITANT + 
                       VOTANTS_PERCENT, 
                       model = TRUE,
                       data = service_data)
    
    return(cox_model)
  }, error = function(e) {
    message("Error in model for service: ", service_name)
    message("Error message: ", e$message)
    return(NULL)
  })
}

# Run models for each service
cat("\n\n=== FITTING MODELS WITH HHI AND POPULATION (HHI INVERTED) ===\n")
models_hhi <- setNames(
  lapply(services, function(s) {
    cat("Fitting model for:", s, "\n")
    model <- run_cox_model_hhi(survival_data, s)
    if(!is.null(model)) {
      cat("Model successfully fit for:", s, "\n")
    } else {
      cat("Model failed for:", s, "\n")
    }
    return(model)
  }),
  services
)

# Remove NULL models
models_hhi_clean <- models_hhi[!sapply(models_hhi, is.null)]

# Coefficient mapping for HHI models (updated label to reflect inversion)
cm_hhi <- c(
  '(Intercept)' = 'Intercept',
  'population' = 'Population',
  'population_squared' = 'Population²',
  'hhi' = 'Political Concentration (HHI - Inverted)',
  'VOTANTS_PERCENT' = 'Turnout %',
  'EUROS_HABITANT' = 'Debt p.c.'
)

# Create summary table
if(length(models_hhi_clean) > 0) {
  cat("\n=== MODEL SUMMARY WITH HHI (INVERTED) ===\n")
  cat("Note: HHI has been multiplied by -1 for more straightforward interpretation\n")
  cat("Higher values now indicate LOWER concentration (more competition)\n\n")
  
  modelsummary(models_hhi_clean, stars = TRUE, coef_map = cm_hhi)
  
  modelsummary(
    models_hhi_clean,
    output = "cox_model_population_hhi_inverted.docx",
    coef_map = cm_hhi,
    stars = TRUE,
    title = "Cox Model: Population + Population² + Political Concentration (HHI - Inverted)",
    notes = "Note: HHI values multiplied by -1. Higher values indicate lower concentration."
  )
}

cat("\n=== FINAL SUMMARY ===\n")
cat("Models successfully fit for:", length(models_hhi_clean), "services\n")
cat("VIF Analysis completed\n")
cat("HHI values inverted (multiplied by -1) for easier interpretation\n")
cat("File created: cox_model_population_hhi_inverted.docx\n")
